Step 0: Install and load Libraries
packages.used=c("rvest", "tibble", "qdap",
"sentimentr", "dplyr",
"syuzhet", "factoextra",
"beeswarm", "RColorBrewer",
"RANN", "topicmodels",
"quanteda", "tidytext", "tidyverse", "scales",
"stringr", "gplots", "wordcloud",
"tm", "viridis", "grid",
"testthat","magrittr","sysfonts","showtext","magick","apcluster")
# check packages that need to be installed.
packages.needed=setdiff(packages.used,
intersect(installed.packages()[,1],
packages.used))
# install additional packages
if(length(packages.needed)>0){
install.packages(packages.needed, dependencies = TRUE)
}
# load packages
library("rvest")
## Loading required package: xml2
library("tibble")
library("qdap")
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following object is masked from 'package:rvest':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
library("sentimentr")
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("dplyr")
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:qdapTools':
##
## id
## The following objects are masked from 'package:qdapRegex':
##
## escape, explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
##
## ngrams
##
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
library("syuzhet")
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
library("factoextra")
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## The following object is masked from 'package:qdapRegex':
##
## %+%
library("beeswarm")
library("scales")
##
## Attaching package: 'scales'
## The following object is masked from 'package:syuzhet':
##
## rescale
library("RColorBrewer")
library("RANN")
library("tm")
library("topicmodels")
source("../lib/plotstacked.R")
source("../lib/speechFuncs.R")
library("quanteda")
## quanteda version 0.9.9.17
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.wfm, ngrams, weight
## The following object is masked from 'package:utils':
##
## View
## The following object is masked from 'package:base':
##
## sample
library("tidyverse")
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## %+%(): ggplot2, qdapRegex
## annotate(): ggplot2, NLP
## col_factor(): readr, scales
## col_numeric(): readr, scales
## discard(): purrr, scales
## escape(): dplyr, qdapRegex
## explain(): dplyr, qdapRegex
## filter(): dplyr, stats
## id(): dplyr, qdapTools
## lag(): dplyr, stats
## tokenize(): readr, quanteda
library("tidytext")
library("scales")
library("stringr")
##
## Attaching package: 'stringr'
## The following object is masked from 'package:qdap':
##
## %>%
library("wordcloud")
##
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
##
## textplot
library("tm")
library("viridis")
library("grid")
library("testthat")
##
## Attaching package: 'testthat'
## The following object is masked from 'package:purrr':
##
## is_null
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:qdap':
##
## %>%
library("magrittr")
##
## Attaching package: 'magrittr'
## The following objects are masked from 'package:testthat':
##
## equals, is_less_than, not
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:qdap':
##
## %>%
library("sysfonts")
library("showtext")
library("magick")
Step 1: Read in Speeches & Text Processing
# Create a named vector of colours for representing the different parties, using
# later in both ggplot and base graphics (wordcloud).
# The Democratic-Republican party colours were red, white and blue but I need something distinctive.
parties <- c(None = "grey50", Federalist = "black", `Democratic-Republican` = "darkgreen",
Whig = "orange", Republican = "red", Democrat = "blue")
# Get the Trump speech
# originally copied this from http://abcnews.go.com/Politics/full-text-president-donald-trumps-inauguration-speech/story?id=44915821
trump <- paste(readLines("https://raw.githubusercontent.com/ellisp/ellisp.github.io/source/data/trump_inauguration.txt"),
collapse = " ")
trump_df <- data_frame(fulltext = trump,
inauguration = "2017-Trump")
# Combine with all the other speeches, and break down into tokens (words),
# one row per word, in order:
inaugural <- data_frame(fulltext = data_char_inaugural,
inauguration = names(data_char_inaugural)) %>%
rbind(trump_df) %>%
mutate(year = as.numeric(str_sub(inauguration, 1, 4)),
president = str_sub(inauguration, start = 6)) %>%
unnest_tokens(word, fulltext, token = "words") %>%
group_by(inauguration) %>%
mutate(sequence = 1:n())
# aggregate / count how many occurances of word in each speech:
words <- inaugural %>%
group_by(inauguration, word, year, president) %>%
summarise(count = n()) %>%
bind_tf_idf(word, inauguration, count) %>%
ungroup()
# note - sentiment matching was very poor. Many words with obvious sentiment
# are missed. so not doing any sentiment analysis for now.
expect_equal(nrow(inaugural), sum(words$count))
# combine with the total count each word used in all speeches:
all_usage <- words %>%
group_by(word) %>%
summarise(total_count = sum(count)) %>%
arrange(desc(total_count))
expect_equal(sum(all_usage$total_count), sum(words$count))
words <- words %>%
left_join(all_usage, by = "word")
# vector of all inaugurations (eg '1961-Kennedy'), use later for looping through:
inaugs <- unique(inaugural$inauguration)
Step 2: A economic Study based on times series of frequency
# time series: # of times the selected word used as percentage of all words
# Original Source http://www.enchantedlearning.com/history/us/pres/list.shtml and re-typed
presidents <- read.csv("https://raw.githubusercontent.com/ellisp/ellisp.github.io/source/data/presidents.csv",
skip = 3, stringsAsFactors = FALSE) %>%
filter(!is.na(year)) %>%
select(inauguration, party)
# You can change the word you want to filter here
annotations <- data_frame(word = c("america", "economy", "tax", "job"),
lab = c("Trump: American-First, Populism, Protectionism",
"The Famous Reganomics, Roosevelt: Great Depression",
"Coolige: Well-known Low Taxation Policy",
"Trump: Bring Production Back to America"
),
y = c(.5, .5, .5, .5) / 100
)
words %>%
mutate(word = ifelse(grepl("americ", word), "america", word),
word = ifelse(grepl("econo", word), "economy", word),
word = ifelse(grepl("employ", word), "job", word),
word = ifelse(grepl("tax", word), "tax", word),
word = ifelse(grepl("job", word), "job", word)) %>%
group_by(inauguration, president, year, word) %>%
summarise(count = sum(count)) %>%
group_by(inauguration, president, year) %>%
mutate(relative_count = count / sum(count)) %>%
filter(word %in% c("america", "economy", "tax", "job")) %>%
left_join(presidents, by = "inauguration") %>%
ggplot(aes(x = year, y = relative_count, label = president)) +
geom_text(size = 3, aes(colour = party)) +
facet_wrap(~word, ncol = 1, scales = "free_y") +
ggtitle("Time Series Plot",
"Presidents labelled if they used the word or a variant.") +
labs(x = "", y = "Number of times used as a percentage of all words", caption = "http://ellisp.github.io") +
scale_colour_manual("", values = parties) +
scale_y_continuous(label = percent) +
geom_text(data = annotations, x = 1935, aes(y = y, label = lab), colour = "grey50", hjust = 1) +
theme(strip.text = element_text(size = 15, face = "bold"))
America: Trump tops, which conincides with his policy, American First, protectionism, stablizing the power of US, bringing production back to America. That’s why word “America” has been used a lot.
Economy: Reagan is known for “Reaganomics”, Roosevelt directed the U.S. government in the era of “Great Depression” https://en.wikipedia.org/wiki/Franklin_D._Roosevelt https://en.wikipedia.org/wiki/Reaganomics
Job: Trump aims at increasing the employment rate in the U.S. and bringing the jobs taken by foreign workers back to U.S. citizens. So he mentioned a lot about job and employment
Tax: Coolidge is known for his idea of “scientific taxation”. He made the tax cut and heavily reduced income tax rates. https://en.wikipedia.org/wiki/Calvin_Coolidge#Taxation_and_government_spending
Step 3: Unique words used in each president’s speech
# We choose 1933-Roosevelt, 1949-Truman, 2009-Obama, 2017-Trump
# to analyze the unique words in their speech
set.seed(123)
par(family = "myfont", bg = "black", mfrow = c(2, 2))
our_inaugs <- c("1933-Roosevelt", "1949-Truman", "2009-Obama", "2017-Trump")
cols <- c("steelblue", "darkred", "steelblue", "darkred")
for(i in 1:length(our_inaugs)){
newwords <- words %>%
filter(total_count == count) %>%
filter(inauguration == our_inaugs[[i]]) %$%
wordcloud(words = .$word,
colors = terrain.colors(15), random.color = TRUE, scale = c(1.1, 0.9))
title(main = gsub("-", " ", our_inaugs[[i]]), col.main = cols[i])
}
1933 is Great Depression. Hence, there are many passive words, like “tearfully”, “mad”, “languish”, as well as some encouraging words, like “evoke”, “unify”, “changers”
Compared with previous presidents, Trump’s words are aggressive and frank
For Truman, he took over the president at the Cold War Time. The words “communist”, “hostilities” are reasonably been mentioned as the first time in the presidential speech.
Step 4: Distinctive Analysis
# Identify words which are most distinctive to a particular Presidential speech
# These words might be used elsewhere, but are most characteristic of a particular speech.
# The bigger the size is, the more frequently the words have been mentioned
for(i in 1:length(inaugs)){
the_party <- presidents[presidents$inauguration == inaugs[[i]], "party"]
# create palette of colours, suitable for the particular party
palfun <- colorRampPalette(c("white", parties[the_party]))
}
#1949-Truman
the_data <- subset(words, inauguration == inaugs[[41]]) %>%
arrange(desc(tf_idf)) %>%
slice(1:80) %>%
arrange(tf_idf) %>%
mutate(fading_colour = palfun(80))
wordcloud(the_data$word,
freq = the_data$tf_idf * max(the_data$tf_idf)* 50,
colors = the_data$fading_colour,
scale = c(6 * max(the_data$tf_idf) / max(words$tf_idf), 0.5),
random.order = FALSE, random.color = FALSE, rot.per = 0)
"1933-Roosevelt"
## [1] "1933-Roosevelt"
the_data <- subset(words, inauguration == inaugs[[37]]) %>%
arrange(desc(tf_idf)) %>%
slice(1:80) %>%
arrange(tf_idf) %>%
mutate(fading_colour = palfun(80))
wordcloud(the_data$word,
freq = the_data$tf_idf * max(the_data$tf_idf)* 50,
colors = the_data$fading_colour,
scale = c(6 * max(the_data$tf_idf) / max(words$tf_idf), 0.5),
random.order = FALSE, random.color = FALSE, rot.per = 0)
"2017-Trump"
## [1] "2017-Trump"
the_data <- subset(words, inauguration == inaugs[[58]]) %>%
arrange(desc(tf_idf)) %>%
slice(1:80) %>%
arrange(tf_idf) %>%
mutate(fading_colour = palfun(80))
wordcloud(the_data$word,
freq = the_data$tf_idf * max(the_data$tf_idf)* 50,
colors = the_data$fading_colour,
scale = c(6 * max(the_data$tf_idf) / max(words$tf_idf), 0.5),
random.order = FALSE, random.color = FALSE, rot.per = 0)
As I said, Truman took over the president at the Cold War Time. You can easily see which words have been mentioned frequently
1933 is Great Depression. “Helped”, “emergency”, “money”, “recovery” been mentioned a lot.
Trump holds totally different policy against “Obama”. “job”, “back”, “america”, these words conincide with Trump’s policy as I suggested before.
Step 1: Data harvest: scrap speech URLs from http://www.presidency.ucsb.edu/.
Following the example of Jerid Francom, we used Selectorgadget to choose the links we would like to scrap. For this project, we selected all inaugural addresses of past presidents, nomination speeches of major party candidates and farewell addresses. We also included several public speeches from Donald Trump for our textual analysis of presidential speeches.
### Inauguaral speeches
main.page <- read_html(x = "http://www.presidency.ucsb.edu/inaugurals.php")
# Get link URLs
# f.speechlinks is a function for extracting links from the list of speeches.
inaug=f.speechlinks(main.page)
#head(inaug)
as.Date(inaug[,1], format="%B %e, %Y")
## [1] "1789-04-30" "1793-03-04" "1797-03-04" "1801-03-04" "1805-03-04"
## [6] "1809-03-04" "1813-03-04" "1817-03-04" "1821-03-04" "1825-03-04"
## [11] "1829-03-04" "1833-03-04" "1837-03-04" "1841-03-04" "1845-03-04"
## [16] "1849-03-05" "1853-03-04" "1857-03-04" "1861-03-04" "1865-03-04"
## [21] "1869-03-04" "1873-03-04" "1877-03-05" "1881-03-04" "1885-03-04"
## [26] "1889-03-04" "1893-03-04" "1897-03-04" "1901-03-04" "1905-03-04"
## [31] "1909-03-04" "1913-03-04" "1917-03-04" "1921-03-04" "1925-03-04"
## [36] "1929-03-04" "1933-03-04" "1937-01-20" "1941-01-20" "1945-01-20"
## [41] "1949-01-20" "1953-01-20" "1957-01-21" "1961-01-20" "1965-01-20"
## [46] "1969-01-20" "1973-01-20" "1977-01-20" "1981-01-20" "1985-01-21"
## [51] "1989-01-20" "1993-01-20" "1997-01-20" "2001-01-20" "2005-01-20"
## [56] "2009-01-20" "2013-01-21" "2017-01-20" NA
inaug=inaug[-nrow(inaug),] # remove the last line, irrelevant due to error.
#### Nomination speeches
main.page=read_html("http://www.presidency.ucsb.edu/nomination.php")
# Get link URLs
nomin <- f.speechlinks(main.page)
#head(nomin)
#
#### Farewell speeches
main.page=read_html("http://www.presidency.ucsb.edu/farewell_addresses.php")
# Get link URLs
farewell <- f.speechlinks(main.page)
#head(farewell)
inaug.list=read.csv("../data/inauglist.csv", stringsAsFactors = FALSE)
nomin.list=read.csv("../data/nominlist.csv", stringsAsFactors = FALSE)
farewell.list=read.csv("../data/farewelllist.csv", stringsAsFactors = FALSE)
speech.list=rbind(inaug.list, nomin.list, farewell.list)
speech.list$type=c(rep("inaug", nrow(inaug.list)),
rep("nomin", nrow(nomin.list)),
rep("farewell", nrow(farewell.list)))
speech.url=rbind(inaug, nomin, farewell)
speech.list=cbind(speech.list, speech.url)
Based on the list of speeches, we scrap the main text part of the transcript’s html page. For simple html pages of this kind, Selectorgadget is very convenient for identifying the html node that rvest can use to scrap its content. For reproducibility, we also save our scrapped speeches into our local folder as individual speech files.
# Loop over each row in speech.list
speech.list$fulltext=NA
for(i in seq(nrow(speech.list))) {
text <- read_html(speech.list$urls[i]) %>% # load the page
html_nodes(".displaytext") %>% # isloate the text
html_text() # get the text
speech.list$fulltext[i]=text
# Create the file name
filename <- paste0("../data/fulltext/",
speech.list$type[i],
speech.list$File[i], "-",
speech.list$Term[i], ".txt")
sink(file = filename) %>% # open file to write
cat(text) # write the file
sink() # close the file
}
Trump, as president-elect that has not been a politician, do not have a lot of formal speeches yet. For our textual analysis, we manually add several public transcripts from Trump: + [Transcript: Donald Trump’s full immigration speech, annotated. LA Times, 08/31/2016] (http://www.latimes.com/politics/la-na-pol-donald-trump-immigration-speech-transcript-20160831-snap-htmlstory.html) + Transcript of Donald Trump’s speech on national security in Philadelphia - The Hill, 09/07/16 + Transcript of President-elect Trump’s news conference CNBC, 01/11/2017
speech1=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech2=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA2.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech3=paste(readLines("../data/fulltext/PressDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
Trump.speeches=data.frame(
President=rep("Donald J. Trump", 3),
File=rep("DonaldJTrump", 3),
Term=rep(0, 3),
Party=rep("Republican", 3),
Date=c("August 31, 2016", "September 7, 2016", "January 11, 2017"),
Words=c(word_count(speech1), word_count(speech2), word_count(speech3)),
Win=rep("yes", 3),
type=rep("speeches", 3),
links=rep(NA, 3),
urls=rep(NA, 3),
fulltext=c(speech1, speech2, speech3)
)
speech.list=rbind(speech.list, Trump.speeches)
We will use sentences as units of analysis for this project, as sentences are natural languge units for organizing thoughts and ideas. For each extracted sentence, we apply sentiment analysis using NRC sentiment lexion. “The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.”
We assign an sequential id to each sentence in a speech (sent.id) and also calculated the number of words in each sentence as sentence length (word.count).
sentence.list=NULL
for(i in 1:nrow(speech.list)){
sentences=sent_detect(speech.list$fulltext[i],
endmarks = c("?", ".", "!", "|",";"))
if(length(sentences)>0){
emotions=get_nrc_sentiment(sentences)
word.count=word_count(sentences)
# colnames(emotions)=paste0("emo.", colnames(emotions))
# in case the word counts are zeros?
emotions=diag(1/(word.count+0.01))%*%as.matrix(emotions)
sentence.list=rbind(sentence.list,
cbind(speech.list[i,-ncol(speech.list)],
sentences=as.character(sentences),
word.count,
emotions,
sent.id=1:length(sentences)
)
)
}
}
Some non-sentences exist in raw data due to erroneous extra end-of sentence marks.
sentence.list=
sentence.list%>%
filter(!is.na(word.count))
# In this analysis, we use Affinity Propagation (Frey and Dueck) to cluster presidents based on emotions
# Affinity Propagation is based on the message passing between data points, it does not require the input of # of clusters, and it also simultaneous considers all data points as exemplars
# It brings more and more attention because of its good performance
# However, you can also choose # of clusters to force APcluster generate k clusters
# For more details:https://cran.r-project.org/web/packages/apcluster/vignettes/apcluster.pdf
sel.comparison=c("DonaldJTrump","JohnMcCain", "GeorgeBush", "MittRomney", "GeorgeWBush",
"RonaldReagan","AlbertGore,Jr", "HillaryClinton","JohnFKerry",
"WilliamJClinton","HarrySTruman", "BarackObama", "LyndonBJohnson",
"GeraldRFord", "JimmyCarter", "DwightDEisenhower", "FranklinDRoosevelt",
"HerbertHoover","JohnFKennedy","RichardNixon","WoodrowWilson",
"AbrahamLincoln", "TheodoreRoosevelt", "JamesGarfield",
"JohnQuincyAdams", "UlyssesSGrant", "ThomasJefferson",
"GeorgeWashington", "WilliamHowardTaft", "AndrewJackson",
"WilliamHenryHarrison", "JohnAdams")
presid.summary=tbl_df(sentence.list)%>%
filter(type=="nomin", File%in%sel.comparison)%>%
#group_by(paste0(type, File))%>%
group_by(File)%>%
summarise(
anger=mean(anger),
anticipation=mean(anticipation),
disgust=mean(disgust),
fear=mean(fear),
joy=mean(joy),
sadness=mean(sadness),
surprise=mean(surprise),
trust=mean(trust)
#negative=mean(negative),
#positive=mean(positive)
)
presid.summary=as.data.frame(presid.summary)
rownames(presid.summary)=as.character((presid.summary[,1]))
library("apcluster")
##
## Attaching package: 'apcluster'
## The following object is masked from 'package:quanteda':
##
## similarity
## The following object is masked from 'package:stats':
##
## heatmap
#Generate similarity matrix
s1<-negDistMat(presid.summary[,-1],r=2)
#Run affinity propagation
apres<-apcluster(s1)
#Show results
show(apres)
##
## APResult object
##
## Number of samples = 23
## Number of iterations = 127
## Input preference = -0.0003596946
## Sum of similarities = -0.001622897
## Sum of preferences = -0.002517862
## Net similarity = -0.00414076
## Number of clusters = 7
##
## Exemplars:
## AbrahamLincoln GeorgeWBush HerbertHoover JimmyCarter JohnFKennedy
## LyndonBJohnson RonaldReagan
## Clusters:
## Cluster 1, exemplar AbrahamLincoln:
## AbrahamLincoln
## Cluster 2, exemplar GeorgeWBush:
## GeorgeWBush
## Cluster 3, exemplar HerbertHoover:
## DonaldJTrump GeraldRFord HerbertHoover JohnMcCain WilliamHowardTaft
## Cluster 4, exemplar JimmyCarter:
## AlbertGore,Jr DwightDEisenhower GeorgeBush JimmyCarter RichardNixon
## WilliamJClinton
## Cluster 5, exemplar JohnFKennedy:
## BarackObama FranklinDRoosevelt HarrySTruman HillaryClinton
## JohnFKennedy JohnFKerry WoodrowWilson
## Cluster 6, exemplar LyndonBJohnson:
## LyndonBJohnson MittRomney
## Cluster 7, exemplar RonaldReagan:
## RonaldReagan
#heatmap
heatmap(apres,s1)
#scatter plot matrices
plot(apres,presid.summary[,-1])
1.The results give us the # of clusters and exemplars in each cluster
Heat map can help us the relationships of the emotion of each president, which clusters are adjacent and close to each other
Scatter plot matrices: The variables are written in a diagonal line from top left to bottom right. Then each variable is plotted against each other. For example, the (1,2) graph is an individual scatterplot of anger & anticipation
#References: https://ellisp.github.io/blog/2017/01/23/inaugural-speeches
#I used functions and followed steps from Peter's Stats Stuff
#Also, I used part of the codes of Professor's Tutorial 2